{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import model as modeling\n",
    "import tensorflow as tf\n",
    "import tokenization\n",
    "import optimization\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = tokenization.FullTokenizer(\n",
    "    vocab_file = 'BERT.wordpiece', do_lower_case = False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rules import normalized_chars\n",
    "import random\n",
    "\n",
    "laughing = {\n",
    "    'huhu',\n",
    "    'haha',\n",
    "    'gagaga',\n",
    "    'hihi',\n",
    "    'wkawka',\n",
    "    'wkwk',\n",
    "    'kiki',\n",
    "    'keke',\n",
    "    'huehue',\n",
    "    'hshs',\n",
    "    'hoho',\n",
    "    'hewhew',\n",
    "    'uwu',\n",
    "    'sksk',\n",
    "    'ksks',\n",
    "    'gituu',\n",
    "    'gitu',\n",
    "    'mmeeooww',\n",
    "    'meow',\n",
    "    'alhamdulillah',\n",
    "    'muah',\n",
    "    'mmuahh',\n",
    "    'hehe',\n",
    "    'salamramadhan',\n",
    "    'happywomensday',\n",
    "    'jahagaha',\n",
    "    'ahakss',\n",
    "    'ahksk'\n",
    "}\n",
    "\n",
    "def make_cleaning(s, c_dict):\n",
    "    s = s.translate(c_dict)\n",
    "    return s\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"\n",
    "    use by any transformer model before tokenization\n",
    "    \"\"\"\n",
    "    string = unidecode(string)\n",
    "    \n",
    "    string = ' '.join(\n",
    "        [make_cleaning(w, normalized_chars) for w in string.split()]\n",
    "    )\n",
    "    string = re.sub('\\(dot\\)', '.', string)\n",
    "    string = (\n",
    "        re.sub(re.findall(r'\\<a(.*?)\\>', string)[0], '', string)\n",
    "        if (len(re.findall(r'\\<a (.*?)\\>', string)) > 0)\n",
    "        and ('href' in re.findall(r'\\<a (.*?)\\>', string)[0])\n",
    "        else string\n",
    "    )\n",
    "    string = re.sub(\n",
    "        r'\\w+:\\/{2}[\\d\\w-]+(\\.[\\d\\w-]+)*(?:(?:\\/[^\\s/]*))*', ' ', string\n",
    "    )\n",
    "    \n",
    "    chars = '.,/'\n",
    "    for c in chars:\n",
    "        string = string.replace(c, f' {c} ')\n",
    "        \n",
    "    string = re.sub(r'[ ]+', ' ', string).strip().split()\n",
    "    string = [w for w in string if w[0] != '@']\n",
    "    x = []\n",
    "    for word in string:\n",
    "        word = word.lower()\n",
    "        if any([laugh in word for laugh in laughing]):\n",
    "            if random.random() >= 0.5:\n",
    "                x.append(word)\n",
    "        else:\n",
    "            x.append(word)\n",
    "    string = [w.title() if w[0].isupper() else w for w in x]\n",
    "    return ' '.join(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/sentiment/news-sentiment/sentiment-data-v2.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from unidecode import unidecode\n",
    "import re\n",
    "\n",
    "df = pd.read_csv('sentiment-data-v2.csv')\n",
    "Y = LabelEncoder().fit_transform(df.label)\n",
    "\n",
    "texts = df.iloc[:,1].tolist()\n",
    "labels = Y.tolist()\n",
    "\n",
    "assert len(labels) == len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import json\n",
    "\n",
    "# with open('/home/husein/sentiment/strong-positives.json') as fopen:\n",
    "#     positives = json.load(fopen)\n",
    "#     positives = random.sample(positives, 500000)\n",
    "    \n",
    "# len(positives)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('/home/husein/sentiment/strong-negatives.json') as fopen:\n",
    "#     negatives = json.load(fopen)\n",
    "#     negatives = random.sample(negatives, 500000)\n",
    "    \n",
    "# len(negatives)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# texts += negatives\n",
    "# labels += [0] * len(negatives)\n",
    "# texts += positives\n",
    "# labels += [1] * len(positives)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "for i in tqdm(range(len(texts))):\n",
    "    texts[i] = cleaning(texts[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "actual_t, actual_l = [], []\n",
    "\n",
    "for i in tqdm(range(len(texts))):\n",
    "    if len(texts[i]) > 2:\n",
    "        actual_t.append(texts[i])\n",
    "        actual_l.append(labels[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "input_ids, input_masks = [], []\n",
    "\n",
    "for text in tqdm(actual_t):\n",
    "    tokens_a = tokenizer.tokenize(text)\n",
    "    tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n",
    "    input_id = tokenizer.convert_tokens_to_ids(tokens)\n",
    "    input_mask = [1] * len(input_id)\n",
    "    \n",
    "    input_ids.append(input_id)\n",
    "    input_masks.append(input_mask)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "epoch = 2\n",
    "batch_size = 60\n",
    "warmup_proportion = 0.1\n",
    "num_train_steps = int(len(texts) / batch_size * epoch)\n",
    "num_warmup_steps = int(num_train_steps * warmup_proportion)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_initializer(initializer_range=0.02):\n",
    "    return tf.truncated_normal_initializer(stddev=initializer_range)\n",
    "\n",
    "class Model:\n",
    "    def __init__(\n",
    "        self,\n",
    "        dimension_output,\n",
    "        learning_rate = 2e-5,\n",
    "        training = True,\n",
    "    ):\n",
    "        self.X = tf.placeholder(tf.int32, [None, None])\n",
    "        self.MASK = tf.placeholder(tf.int32, [None, None])\n",
    "        self.Y = tf.placeholder(tf.int32, [None])\n",
    "        \n",
    "        model = modeling.Model(\n",
    "            dim = 512, vocab_size = 32000, depth = 12, mlp_dim = 3072\n",
    "        )\n",
    "        sequence_output = model(\n",
    "            self.X, input_mask = self.MASK, training = training\n",
    "        )\n",
    "        \n",
    "        output_layer = sequence_output\n",
    "        output_layer = tf.layers.dense(\n",
    "            output_layer,\n",
    "            model.hidden_size,\n",
    "            activation=tf.tanh,\n",
    "            kernel_initializer=create_initializer())\n",
    "        self.logits_seq = tf.layers.dense(output_layer, dimension_output,\n",
    "                                         kernel_initializer=create_initializer())\n",
    "        self.logits_seq = tf.identity(self.logits_seq, name = 'logits_seq')\n",
    "        self.logits = self.logits_seq[:, 0]\n",
    "        self.logits = tf.identity(self.logits, name = 'logits')\n",
    "        \n",
    "        self.cost = tf.reduce_mean(\n",
    "            tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
    "                logits = self.logits, labels = self.Y\n",
    "            )\n",
    "        )\n",
    "        \n",
    "        self.optimizer = optimization.create_optimizer(self.cost, learning_rate, \n",
    "                                                       num_train_steps, num_warmup_steps, False)\n",
    "        correct_pred = tf.equal(\n",
    "            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n",
    "        )\n",
    "        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INIT_CHKPNT = 'fnet-base/model.ckpt-475000'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dimension_output = 2\n",
    "learning_rate = 2e-5\n",
    "\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(\n",
    "    dimension_output,\n",
    "    learning_rate\n",
    ")\n",
    "\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import re\n",
    "\n",
    "def get_assignment_map_from_checkpoint(tvars, init_checkpoint):\n",
    "    \"\"\"Compute the union of the current variables and checkpoint variables.\"\"\"\n",
    "    assignment_map = {}\n",
    "    initialized_variable_names = {}\n",
    "\n",
    "    name_to_variable = collections.OrderedDict()\n",
    "    for var in tvars:\n",
    "        name = var.name\n",
    "        m = re.match('^(.*):\\\\d+$', name)\n",
    "        if m is not None:\n",
    "            name = m.group(1)\n",
    "        name_to_variable[name] = var\n",
    "\n",
    "    init_vars = tf.train.list_variables(init_checkpoint)\n",
    "\n",
    "    assignment_map = collections.OrderedDict()\n",
    "    for x in init_vars:\n",
    "        (name, var) = (x[0], x[1])\n",
    "        if name not in name_to_variable:\n",
    "            continue\n",
    "        assignment_map[name] = name_to_variable[name]\n",
    "        initialized_variable_names[name] = 1\n",
    "        initialized_variable_names[name + ':0'] = 1\n",
    "\n",
    "    return (assignment_map, initialized_variable_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tvars = tf.trainable_variables()\n",
    "assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, \n",
    "                                                                                INIT_CHKPNT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "saver = tf.train.Saver(var_list = assignment_map)\n",
    "saver.restore(sess, INIT_CHKPNT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_input_ids, test_input_ids, train_Y, test_Y, train_mask, test_mask = train_test_split(\n",
    "    input_ids, actual_l[:len(input_ids)], input_masks, test_size = 0.2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pad_sequences = tf.keras.preprocessing.sequence.pad_sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import time\n",
    "\n",
    "for EPOCH in range(10):\n",
    "\n",
    "    train_acc, train_loss, test_acc, test_loss = [], [], [], []\n",
    "    pbar = tqdm(\n",
    "        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'\n",
    "    )\n",
    "    for i in pbar:\n",
    "        index = min(i + batch_size, len(train_input_ids))\n",
    "        batch_x = train_input_ids[i: index]\n",
    "        batch_x = pad_sequences(batch_x, padding='post')\n",
    "        batch_mask = train_mask[i: index]\n",
    "        batch_mask = pad_sequences(batch_mask, padding='post')\n",
    "        batch_y = train_Y[i: index]\n",
    "        acc, cost, _ = sess.run(\n",
    "            [model.accuracy, model.cost, model.optimizer],\n",
    "            feed_dict = {\n",
    "                model.Y: batch_y,\n",
    "                model.X: batch_x,\n",
    "                model.MASK: batch_mask\n",
    "            },\n",
    "        )\n",
    "        train_loss.append(cost)\n",
    "        train_acc.append(acc)\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "        \n",
    "    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')\n",
    "    for i in pbar:\n",
    "        index = min(i + batch_size, len(test_input_ids))\n",
    "        batch_x = test_input_ids[i: index]\n",
    "        batch_x = pad_sequences(batch_x, padding='post')\n",
    "        batch_mask = test_mask[i: index]\n",
    "        batch_mask = pad_sequences(batch_mask, padding='post')\n",
    "        batch_y = test_Y[i: index]\n",
    "        acc, cost = sess.run(\n",
    "            [model.accuracy, model.cost],\n",
    "            feed_dict = {\n",
    "                model.Y: batch_y,\n",
    "                model.X: batch_x,\n",
    "                model.MASK: batch_mask\n",
    "            },\n",
    "        )\n",
    "        test_loss.append(cost)\n",
    "        test_acc.append(acc)\n",
    "        pbar.set_postfix(cost = cost, accuracy = acc)\n",
    "        \n",
    "    train_loss = np.mean(train_loss)\n",
    "    train_acc = np.mean(train_acc)\n",
    "    test_loss = np.mean(test_loss)\n",
    "    test_acc = np.mean(test_acc)\n",
    "    \n",
    "    print(\n",
    "        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n",
    "        % (EPOCH, train_loss, train_acc, test_loss, test_acc)\n",
    "    )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
